class color:
    """ANSI escape sequences for styling text printed to a terminal."""

    # Foreground colours.
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Append after styled text to switch all styling off again.
    END = '\033[0m'
#nbi:hide_in
import torch
import data_loader.data_loaders as module_data
from data_loader.data_loaders import NormDataLoader
import model.model as module_arch
from utils import get_instance
import math
import librosa
from IPython.display import display, Audio
from tqdm import tqdm
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# Checkpoint of the trained model; it is read with torch.load further down.
model_path = '/data/yinjyun/projects/gmvae-timbre/gmvae_v2-timbre/saved/Sol_GMVAE_v2/melspec_256-first_chunk-include_onset-fix_piano-normalize-ss_1.0-latent_16-pow_0-lvl_0-pc_1-pd_1-val_lower_bound/model_best.pth'
# Analysis settings shared by the mel filterbank (librosa.filters.mel) and the
# Griffin-Lim resynthesis below. They presumably have to match the settings the
# training spectrograms were computed with -- TODO confirm.
sr = 22050  # sampling rate: used for the mel filterbank and for audio playback
n_fft = 2048  # FFT size of the STFT / mel filterbank
n_mel = 256  # number of mel bands
hop_length = 256  # STFT hop passed to griffinlim
fmin = 27  # lower frequency edge of the mel filterbank
fmax = 11000  # upper frequency edge of the mel filterbank
def griffinlim(spectrogram, n_iter=100, window='hann', n_fft=2048, hop_length=-1, verbose=False):
    """Estimate a waveform from a magnitude spectrogram with Griffin-Lim.

    Starting from a random phase, each iteration inverts the current complex
    spectrogram to audio, re-analyses that audio with an STFT, and keeps only
    the phase of the result while the magnitude stays fixed to the input.

    Args:
        spectrogram: STFT magnitudes; only ``np.abs(spectrogram)`` is used.
        n_iter: number of phase-refinement iterations.
        window: window specification forwarded to librosa's STFT/ISTFT.
        n_fft: FFT size used when re-analysing the reconstructed audio.
        hop_length: STFT hop; ``-1`` selects ``n_fft // 4``.
        verbose: show a tqdm progress bar with the Frobenius norm of the
            magnitude mismatch.

    Returns:
        The waveform obtained by inverting the magnitudes with the final
        phase estimate.
    """
    if hop_length == -1:
        hop_length = n_fft // 4

    # The builtin ``complex`` replaces ``np.complex``, an alias that was
    # removed from NumPy and now raises AttributeError. The magnitude never
    # changes, so it is computed once instead of on every iteration.
    magnitude = np.abs(spectrogram).astype(complex)
    angles = np.exp(2j * np.pi * np.random.rand(*spectrogram.shape))

    progress = tqdm(range(n_iter), ncols=100, mininterval=2.0, disable=not verbose)
    for _ in progress:
        inverse = librosa.istft(magnitude * angles, hop_length=hop_length, window=window)
        rebuilt = librosa.stft(inverse, n_fft=n_fft, hop_length=hop_length, window=window)
        # Keep only the phase of the re-analysed signal.
        angles = np.exp(1j * np.angle(rebuilt))

        if verbose:
            diff = np.abs(spectrogram) - np.abs(rebuilt)
            progress.set_postfix(loss=np.linalg.norm(diff, 'fro'))

    return librosa.istft(magnitude * angles, hop_length=hop_length, window=window)
def denormalize(S, d_min, d_max):
    """Undo the [-1, 1] scaling of a log-domain spectrogram.

    ``S`` is mapped linearly from [-1, 1] onto [d_min, d_max] and the result
    is exponentiated.
    """
    unit_scaled = (S + 1) / (2)
    log_values = unit_scaled * (d_max - d_min) + d_min
    return np.exp(log_values)
# Scan the whole dataset once to find the global minimum and maximum values;
# denormalize() later maps model outputs back onto this range.
dl_norm = NormDataLoader(
    data_dir="/data/yinjyun/datasets/sol/acidsInstruments-ordinario/data/melspec_256-first_chunk-include_onset-fix_piano/",
    batch_size=512,
    validation_split=0.1,
    shuffle=True,
    num_workers=0,
)
d_max = -math.inf
d_min = math.inf
for i, (data, target, idx) in enumerate(dl_norm):
    # Fold each batch's extrema into the running global extrema.
    d_max = max(d_max, data.max().item())
    d_min = min(d_min, data.min().item())
# Mel filterbank; its transpose is used below to map mel spectrograms back to
# linear-frequency bins before Griffin-Lim.
mel_filter = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mel, fmin=fmin, fmax=fmax)

# map_location='cpu': everything downstream runs on the CPU (model outputs are
# converted with .numpy()), so the checkpoint must load even when it was saved
# from a GPU and no GPU is available on this machine.
resume = torch.load(model_path, map_location='cpu')
config = resume['config']
n_class = config['arch']['args']['n_class']
latent_dim = config['arch']['args']['latent_dim']
batch_size = config['data_loader']['args']['batch_size']

# Rebuild the architecture from the stored config and restore its weights.
model = get_instance(module_arch, 'arch', config)
# model.summary()
model.load_state_dict(resume['state_dict'])
model.eval()
model.is_featExtract = False

# Recreate the data loader described by the checkpoint config and take its
# validation split.
data_loader = get_instance(module_data, 'data_loader', config)
valid_data_loader = data_loader.split_validation()

# Label maps and their inverses (inverse maps go from the mapped value back to
# the original key).
pitch_map = data_loader.dataset.pitch_map
pitchclass_map = data_loader.dataset.pitchclass_map
octave_map = {str(k): v for k, v in enumerate(range(8))}
inv_pitch_map = {v: k for k, v in pitch_map.items()}
inv_pitchclass_map = {v: k for k, v in pitchclass_map.items()}

# Instrument abbreviations; the list position is the integer instrument id.
desired_ins_key = ['Ehn', 'Fhn', 'Trtb', 'Trop', 'Pno', 'Vn', 'Vc', 'Sax', 'Bn', 'Clr', 'Fl', 'Ob']
ins_map = {i: n for n, i in enumerate(desired_ins_key)}
inv_ins_map = {v: k for k, v in ins_map.items()}
def control_syn(octave, pitchclass, instrument):
    """Synthesize, plot and play one note for a chosen pitch and instrument.

    Args:
        octave: octave number; appended (as text) to the pitch-class name to
            form the key looked up in ``pitch_map``.
        pitchclass: key of ``inv_pitchclass_map``.
        instrument: integer instrument id (key of ``inv_ins_map``).

    The timbre and pitch codes are drawn via ``model._infer_latent`` from the
    per-instrument and per-pitch mean / log-variance lookups, decoded to a
    mel spectrogram, shown with matplotlib, and turned into audio with
    Griffin-Lim. Nothing is returned.
    """
    plt.close()

    note_name = inv_pitchclass_map[pitchclass] + str(octave)
    print(note_name, inv_ins_map[instrument])

    pitch_index = pitch_map[note_name]
    timbre_id = torch.tensor(instrument)
    pitch_id = torch.tensor(pitch_index)

    # Parameters of the conditional distributions for this instrument / pitch.
    timbre_mu = model.mu_lookup(timbre_id).data.unsqueeze(0)
    timbre_logvar = model.logvar_lookup(timbre_id).data.unsqueeze(0)
    pitch_mu = model.pitch_mu_lookup(pitch_id).unsqueeze(0)
    pitch_logvar = model.pitch_logvar_lookup(pitch_id).unsqueeze(0)

    # Element 2 of the returned tuple is used as the latent code
    # (presumably the sampled z -- verify against model._infer_latent).
    z_t = model._infer_latent(timbre_mu, timbre_logvar, weight=1)[2]
    z_p = model._infer_latent(pitch_mu, pitch_logvar, weight=1)[2]

    latent = torch.cat([z_t, z_p], dim=1)
    mel = model._decode(latent).data.numpy().squeeze(0)
    mel = denormalize(mel, d_min, d_max)
    plt.imshow(librosa.power_to_db(mel), aspect='auto', origin='lower')

    # Project the mel bands back to STFT bins, then estimate the phase.
    linear = np.dot(mel_filter.T, mel)
    waveform = griffinlim(linear, n_iter=50, n_fft=n_fft, hop_length=hop_length)
    display(Audio(waveform, rate=sr))
# from ipywidgets import interact
# interact(control_syn, octave=octave_map, pitchclass=pitchclass_map, instrument=ins_map)?
We generate the Mel-spectrograms as described in the paper, and use Griffin-Lim to synthesize the waveforms. Note that we do not focus on good audio quality in this paper, and the inferior quality is mainly due to the algorithm used to synthesize the waveforms (as the original Mel-spectrograms and the generated ones result in similar audio quality). We will address this in future work by using advanced auto-regressive networks such as WaveNet for audio synthesis.
Firstly, we present the audio that is synthesized using the original Mel-spectrogram. Specifically, we convert a piano sample to Mel-spectrogram, and resynthesize back to audio using Griffin-Lim. This is to give a reference of the audio quality obtained by Griffin-Lim.
Now we demonstrate the controllable sound synthesis. As described in the paper Section 4.3, we specify the target pitch $\mathbf{y_m}$ and instrument $\mathbf{y_k}$, and sample the pitch code $\mathbf{z}_p$ and timbre code $\mathbf{z}_t$ from the conditional distribution $p(\mathbf{z}_p | \mathbf{y_m})$ and $p(\mathbf{z}_t | \mathbf{y_k})$, respectively, where $p(\mathbf{z}_{p} | \mathbf{y}_{p}) = \mathcal{N}(\mathbf{\mu}_{\mathbf{y}_{p}}, \textrm{diag}(\mathbf{\sigma}_{\mathbf{y}_{p}}))$ and $p(\mathbf{z}_{t} | \mathbf{y}_{t}) = \mathcal{N}(\mathbf{\mu}_{\mathbf{y}_{t}}, \textrm{diag}(\mathbf{\sigma}_{\mathbf{y}_{t}}))$. In the following demonstration, we specify the same pitches for all instruments, play the audio and display the corresponding Mel-spectrograms.
As described in Section 4.4 in the paper, we first infer $\mathbf{z}_p$ and $\mathbf{z}_t$ of the source input, and modify $\mathbf{z}_t$ (denoted as $\mathbf{z}_{source}$) by:
$$\mathbf{z}_{transfer} = \mathbf{z}_{source} + \alpha\mathbf{\mu}_{source \rightarrow target},$$
where $\mathbf{\mu}_{source \rightarrow target} = \mathbf{\mu}_{target} - \mathbf{\mu}_{source}$, and $\alpha \in [0, 1]$. We then synthesize the spectrogram by passing $[\mathbf{z}_p, \mathbf{z}_{transfer}]$ to the decoder. See Fig. 4 for an illustration of transferring French horn to piano.
Note that, in practice, we do not need labels of source instrument and pitch for timbre transfer, as the two variables are automatically inferred by $q(\mathbf{z}_p | \mathbf{X})$ and $q(\mathbf{z}_t | \mathbf{X})$, respectively. $q(\mathbf{y}_{t} | \mathbf{X})$ infers the mixture component (source instrument identity) to which $\mathbf{X}$ belongs, and $\mathbf{\mu}_{source \rightarrow target}$ is then obtained by subtracting the mean of the source's mixture component from that of the target.
#nbi:hide_in
import pandas as pd
# Label maps taken from the dataset, plus reversed copies for decoding
# (mapped value -> original key).
pitchclass_map = data_loader.dataset.pitchclass_map
dyn_map = data_loader.dataset.dynamic_map
inv_pitchclass_map = {code: name for name, code in pitchclass_map.items()}
inv_dyn_map = {code: label for label, code in dyn_map.items()}
def call_inv_dyn(x):
    """Return the entry of ``inv_dyn_map`` stored under dynamic code ``x``."""
    dynamic_label = inv_dyn_map[x]
    return dynamic_label
def call_inv_pitchclass(x):
    """Return the entry of ``inv_pitchclass_map`` stored under code ``x``."""
    pitchclass_label = inv_pitchclass_map[x]
    return pitchclass_label
def call_inv_pitch(x):
    """Return the entry of ``inv_pitch_map`` stored under pitch code ``x``."""
    pitch_label = inv_pitch_map[x]
    return pitch_label
def call_inv_ins(x):
    """Return the entry of ``inv_ins_map`` stored under instrument id ``x``."""
    instrument_label = inv_ins_map[x]
    return instrument_label
def get_octave(x):
    """Return the last element of ``x`` (the octave digit of a pitch name)."""
    octave_digit = x[-1]
    return octave_digit
def get_octave_range(x):
    """Classify an octave as 'low' (4 or below) or 'high' (above 4).

    ``x`` is converted with ``int`` first, so digit strings are accepted.
    """
    return 'low' if int(x) <= 4 else 'high'
# Run the model over the validation split and gather inputs, labels, latent
# codes and reconstructions for every sample.
torch.manual_seed(123)

# Per-batch results are appended to lists and concatenated once after the
# loop; calling torch.cat on every iteration would re-copy all earlier
# batches each time.
_idx_batches, _assign_batches, _data_batches = [], [], []
_z_batches, _logvar_batches = [], []
_ins_batches, _pitch_batches, _pitchclass_batches, _dyn_batches = [], [], [], []
_x_predict_batches, _pitch_z_batches = [], []

with torch.no_grad():
    # for i, (data, target, data_ind) in enumerate(data_loader):
    for i, (data, target, data_ind) in enumerate(valid_data_loader):
        y_ins, y_pitch_class, y_pitch, y_dyn = target[0], target[1], target[2], target[3]

        # Reshape to (-1, n_band, context_size) before feeding the model.
        n_band, context_size = data.size(2), data.size(3)
        data = data.view(-1, n_band, 1, context_size).squeeze(2)

        (x_predict, mu, logvar, z, log_q_y_logit, q_y, ind,
         pitch_mu, pitch_logvar, pitch_z, pitch_logit) = model(data)

        _idx_batches.append(data_ind)
        _assign_batches.append(ind)
        _data_batches.append(data)
        _z_batches.append(z)
        _logvar_batches.append(logvar)
        _ins_batches.append(y_ins)
        _pitch_batches.append(y_pitch)
        _pitchclass_batches.append(y_pitch_class)
        _dyn_batches.append(y_dyn)
        _x_predict_batches.append(x_predict)
        if model.is_pitch_condition:
            _pitch_z_batches.append(pitch_z)

    if not _idx_batches:
        raise RuntimeError('valid_data_loader yielded no batches; nothing to analyse')

    idx_all = torch.cat(_idx_batches)
    assign = torch.cat(_assign_batches)
    d_all = torch.cat(_data_batches, dim=0)
    z_all = torch.cat(_z_batches, dim=0)
    logvar_all = torch.cat(_logvar_batches, dim=0)
    ins_gt = torch.cat(_ins_batches)
    pitch_gt = torch.cat(_pitch_batches)
    pitchclass_gt = torch.cat(_pitchclass_batches)
    dyn_gt = torch.cat(_dyn_batches)
    x_predict_all = torch.cat(_x_predict_batches, dim=0)
    # pitch_z_all only exists when the model is pitch-conditioned.
    if _pitch_z_batches:
        pitch_z_all = torch.cat(_pitch_z_batches, dim=0)
# One row per validation sample: ground-truth label codes plus the mixture
# component the model assigned to the sample.
d = {
    column: labels.data.numpy()
    for column, labels in (
        ('ins', ins_gt),
        ('pitch', pitch_gt),
        ('pitchclass', pitchclass_gt),
        ('dyn', dyn_gt),
        ('assign', assign),
    )
}
df = pd.DataFrame(d)

# Decode the integer codes back to their labels through the inverse maps.
df['dyn_inv'] = np.vectorize(call_inv_dyn)(df['dyn'])
df['pitchclass_inv'] = np.vectorize(call_inv_pitchclass)(df['pitchclass'])
df['pitch_inv'] = np.vectorize(call_inv_pitch)(df['pitch'])
df['ins_inv'] = np.vectorize(call_inv_ins)(df['ins'])

# Octave (last character of the pitch name) and a coarse low/high split.
df['octave'] = np.vectorize(get_octave)(df['pitch_inv'])
df['octave_range'] = np.vectorize(get_octave_range)(df['octave'])
df.head()
def recognize_family(x):
    """Map an integer instrument id to its instrument-family name.

    Ids 0-3 -> 'brass', 5-6 -> 'string', 4 -> 'piano', anything else -> 'wind'.
    """
    # NOTE(review): id 0 is 'Ehn' in desired_ins_key; if that stands for
    # English horn it would normally count as a wind instrument -- confirm.
    if x in [0, 1, 2, 3]:
        return 'brass'
    if x in [5, 6]:
        return 'string'
    if x == 4:
        return 'piano'
    return 'wind'
# Tag every row with its instrument family, derived from the integer
# instrument id in the 'ins' column.
df['family'] = np.vectorize(recognize_family)(df['ins'])
# Number of validation samples per (family, instrument) pair; as the last
# expression it is presumably what the notebook cell displays.
df.groupby(['family', 'ins_inv']).size()
# pins.groupby(['instrument', 'pitch']).size().unstack(fill_value=0)
#nbi:hide_in
# Example settings for the timbre-transfer demo. The instrument names are
# entries of desired_ins_key (and therefore keys of ins_map).
source_ins = 'Pno'
target_ins = 'Vc'
# Not referenced anywhere in the visible part of this file.
desired_octave = 4
def timbre_transfer(source_ins, target_ins, desired_pitches=None, n_sample=None):
    """Plot and play timbre transfers from one instrument to another.

    For each selected validation sample of ``source_ins`` the timbre code is
    shifted towards the target instrument,

        z_transfer = z_source + alpha * (mu_target - mu_source),

    for alpha in 0.25, 0.5, 0.75, 1.0. Each shifted code is decoded together
    with the sample's own pitch code. Every row of the figure shows the
    reconstruction of the source sample (alpha = 0) followed by the four
    transfers, and the audio of every panel is resynthesized with Griffin-Lim
    and displayed.

    Args:
        source_ins: instrument abbreviation of the source (key of ``ins_map``).
        target_ins: instrument abbreviation of the target (key of ``ins_map``).
        desired_pitches: optional collection of pitch names; when given, only
            drawn samples whose pitch is in it are kept.
        n_sample: when falsy, up to two source samples are drawn at random.
            NOTE: any truthy value selects *all* source samples, regardless of
            the number passed. This is the established behaviour and is kept
            for compatibility.

    Returns:
        None. If no sample survives the selection a message is printed and no
        figure is created.

    Relies on module-level state: ``mu_prior`` (not defined in the visible
    part of this file -- presumably the per-instrument prior means), ``df``,
    ``z_all``, ``pitch_z_all``, ``x_predict_all``, ``model``, ``mel_filter``,
    ``d_min`` / ``d_max``, ``sr``, ``n_fft`` and ``hop_length``. It also
    reseeds NumPy's global random generator with 111.
    """
    mu_target = mu_prior[ins_map[target_ins]]
    mu_source = mu_prior[ins_map[source_ins]]
    mu_s2t = (mu_target - mu_source).unsqueeze(0)

    source_idx = np.where((df['ins_inv'] == source_ins))[0]
    alpha = [0.25, 0.5, 0.75, 1.0]
    if not n_sample:
        # Capped so that drawing without replacement cannot fail when fewer
        # than two source samples exist.
        n_sample = min(2, len(source_idx))
    else:
        n_sample = len(source_idx)
    n_col = len(alpha) + 1

    np.random.seed(111)
    sample_idx = np.random.choice(source_idx, n_sample, replace=False)
    if desired_pitches:
        sample_idx = [i for i in sample_idx if df['pitch_inv'][i] in desired_pitches]
    n_row = len(sample_idx)

    if n_row == 0:
        # plt.subplots cannot build a grid with zero rows.
        print('No %s samples selected; nothing to transfer.' % source_ins)
        return

    # squeeze=False keeps ``ax`` two-dimensional even for a single row, so
    # ax[n][k] indexing below is always valid.
    fig, ax = plt.subplots(n_row, n_col, figsize=(n_col * 2, n_row * 2),
                           sharey=True, sharex=True, squeeze=False)

    for n, i in enumerate(sample_idx):
        #target_ins_idx = np.where((df['ins_inv'] == target_ins) &
        #                          (df['pitch_inv'] == df['pitch_inv'][i]))[0]
        zp_source = pitch_z_all[i].unsqueeze(0)
        zt_source = z_all[i]

        # Column 0: the model's reconstruction of the untouched source sample.
        s_source = x_predict_all[i].data.numpy()
        s_source = denormalize(s_source, d_min, d_max)
        ax[n][0].imshow(librosa.power_to_db(s_source), aspect='auto', origin='lower')
        if n == 0:
            ax[n][0].set_title(r'$\alpha = 0$', fontsize=15)
        ax[n][0].set_ylabel('%s %s'% (source_ins, df['pitch_inv'][i]), rotation=360, fontsize=15)
        ax[n][0].yaxis.set_label_coords(-0.5, 0.5)

        # n_fft must agree with the FFT size the mel filterbank was built for.
        x_source = griffinlim(np.dot(mel_filter.T, s_source), n_iter=50, n_fft=n_fft, hop_length=hop_length)
        print(df['pitch_inv'][i], df['dyn_inv'][i], df['ins_inv'][i])
        display(Audio(x_source, rate=sr))

        # Columns 1..: progressively stronger shifts towards the target timbre.
        for k, a in enumerate(alpha):
            z_s2t = zt_source + a * mu_s2t
            s_transfer = model._decode(torch.cat([z_s2t, zp_source], dim=1)).squeeze(0).data.numpy()
            s_transfer = denormalize(s_transfer, d_min, d_max)
            ax[n][k + 1].imshow(librosa.power_to_db(s_transfer), aspect='auto', origin='lower')
            if n == 0:
                ax[n][k + 1].set_title(r'$\alpha = %.2f$' % a, fontsize=15)
            if k == len(alpha)-1:
                # Label the right edge of the row with the target instrument.
                ax2 = ax[n][k + 1].twinx()
                ax2.set_ylabel(target_ins, rotation=360, fontsize=15)
                ax2.yaxis.set_label_coords(1.33, 0.6)
                ax2.set_yticks([])

            x_transfer = griffinlim(np.dot(mel_filter.T, s_transfer), n_iter=50, n_fft=n_fft, hop_length=hop_length)
            display(Audio(x_transfer, rate=sr))

    plt.tight_layout()
    #plt.savefig('/data/yinjyun/projects/gmvae-timbre/gmvae_v2-timbre/saved/Sol_GMVAE_v2/transfer_p2c.eps', format='eps', dpi=50)
Following Fig. 5 in the paper, we demonstrate $\texttt{Fhn} \rightarrow \texttt{Pno}$, $\texttt{Pno} \rightarrow \texttt{Vc}$, $\texttt{Vc} \rightarrow \texttt{Bn}$, and $\texttt{Bn} \rightarrow \texttt{Fhn}$.